Add app.py
- app.py +102 -0
- constants.py +33 -0
- examples/1.jpg +0 -0
- examples/2.jpg +0 -0
- examples/3.jpg +0 -0
- models.py +135 -0
- utils.py +250 -0
app.py
ADDED
@@ -0,0 +1,102 @@
import gradio as gr
import numpy as np
from ultralytics import YOLO
from torchvision.transforms.functional import to_tensor
from huggingface_hub import hf_hub_download
import torch
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from utils import *
from models import YOLOStamp, Encoder

device = 'cuda' if torch.cuda.is_available() else 'cpu'


yolov8 = YOLO(hf_hub_download('stamps-labs/yolov8-finetuned', filename='best.torchscript'), task='detect')

yolo_stamp = YOLOStamp()
yolo_stamp.load_state_dict(torch.load(hf_hub_download('stamps-labs/yolo-stamp', filename='state_dict.pth')))
yolo_stamp = yolo_stamp.to(device)
yolo_stamp.eval()
transform = A.Compose([
    A.Normalize(),
    ToTensorV2(p=1.0),
])

vits8 = torch.jit.load(hf_hub_download('stamps-labs/vits8-stamp', filename='vits8stamp-torchscript.pth'))
vits8 = vits8.to(device)
vits8.eval()

encoder = Encoder()
encoder.load_state_dict(torch.load(hf_hub_download('stamps-labs/vae-encoder', filename='encoder.pth')))
encoder = encoder.to(device)
encoder.eval()


def predict(image, det_choice, emb_choice):

    shape = torch.tensor(image.size)
    image = image.convert('RGB')

    if det_choice == 'yolov8':
        coef = torch.hstack((shape, shape)) / 640
        image = image.resize((640, 640))
        boxes = yolov8(image)[0].boxes.xyxy.cpu()
        image_with_boxes = visualize_bbox(image, boxes)

    elif det_choice == 'yolo-stamp':
        coef = torch.hstack((shape, shape)) / 448
        image = image.resize((448, 448))
        image_tensor = transform(image=np.array(image))['image']
        output = yolo_stamp(image_tensor.unsqueeze(0).to(device))

        boxes = output_tensor_to_boxes(output[0].detach().cpu())
        boxes = nonmax_suppression(boxes)
        boxes = xywh2xyxy(torch.tensor(boxes)[:, :4])
        image_with_boxes = visualize_bbox(image, boxes)
    else:
        return


    embeddings = []
    if emb_choice == 'vits8':
        for box in boxes:
            cropped_stamp = to_tensor(image.crop(box.tolist()))
            embeddings.append(vits8(cropped_stamp.unsqueeze(0).to(device))[0].detach().cpu())

    elif emb_choice == 'vae-encoder':
        for box in boxes:
            cropped_stamp = to_tensor(image.crop(box.tolist()).resize((118, 118)))
            embeddings.append(encoder(cropped_stamp.unsqueeze(0).to(device))[0][0].detach().cpu())

    embeddings = np.stack(embeddings)

    similarities = cosine_similarity(embeddings)

    boxes = boxes * coef
    df_boxes = pd.DataFrame(boxes, columns=['x1', 'y1', 'x2', 'y2'])

    fig, ax = plt.subplots()
    im, cbar = heatmap(similarities, range(1, len(embeddings) + 1), range(1, len(embeddings) + 1), ax=ax,
                       cmap="YlGn", cbarlabel="Embeddings similarities")
    texts = annotate_heatmap(im, valfmt="{x:.3f}")
    return image_with_boxes, df_boxes, embeddings, fig


examples = [['./examples/1.jpg', 'yolov8', 'vits8'], ['./examples/2.jpg', 'yolov8', 'vae-encoder'], ['./examples/3.jpg', 'yolo-stamp', 'vits8']]
inputs = [
    gr.Image(type="pil"),
    gr.Dropdown(choices=['yolov8', 'yolo-stamp'], value='yolov8', label='Detection model'),
    gr.Dropdown(choices=['vits8', 'vae-encoder'], value='vits8', label='Embedding model'),
]
outputs = [
    gr.Image(type="pil"),
    gr.DataFrame(type='pandas', label="Bounding boxes"),
    gr.DataFrame(type='numpy', label="Embeddings"),
    gr.Plot(type='numpy', label="Cosine Similarities")
]
app = gr.Interface(predict, inputs, outputs, examples=examples)
app.launch()
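A minimal sketch of calling predict() directly, outside the Gradio UI, using one of the example images added in this commit (assuming the model downloads above succeed):

from PIL import Image

img = Image.open('./examples/1.jpg')
image_with_boxes, df_boxes, embeddings, fig = predict(img, 'yolov8', 'vits8')
print(df_boxes)          # bounding boxes rescaled back to original-image coordinates
print(embeddings.shape)  # (number of detected stamps, embedding dimension)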
constants.py
ADDED
@@ -0,0 +1,33 @@
# shape of input image to YOLO
W, H = 448, 448
# grid size after last convolutional layer of YOLO
S = 7
# anchors of YOLO model
ANCHORS = [[1.5340836003942058, 1.258424277571925],
           [1.4957766780406023, 2.2319885681948217],
           [1.2508985343739407, 0.8233350471152914]]
# number of anchor boxes
BOX = len(ANCHORS)
# maximum number of stamps on an image
STAMP_NB_MAX = 10
# minimal confidence of the presence of a stamp in a grid cell
OUTPUT_THRESH = 0.7
# maximal IoU score to consider boxes different
IOU_THRESH = 0.3
# path to folder containing images
IMAGE_FOLDER = './data/images'
# path to .csv file containing annotations
ANNOTATIONS_PATH = './data/all_annotations.csv'
# standard deviation and mean of pixel values for normalization
STD = (0.229, 0.224, 0.225)
MEAN = (0.485, 0.456, 0.406)
# box color to show the bounding box on image
BOX_COLOR = (0, 0, 255)


# dimension of image embedding
Z_DIM = 128
# hidden dimensions for encoder model
ENC_HIDDEN_DIM = 16
# hidden dimensions for decoder model
DEC_HIDDEN_DIM = 64
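A small illustration of how the grid constants are used downstream (see output_tensor_to_boxes in utils.py); the numbers are simply the constants above worked out:

# Each of the S x S grid cells covers W/S x H/S pixels.
cell_w, cell_h = W / S, H / S   # 448 / 7 = 64 px per cell
# Anchor sizes are expressed in cell units, so the first anchor is roughly
# 1.53 * 64 ≈ 98 px wide and 1.26 * 64 ≈ 81 px tall after decoding.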
examples/1.jpg
ADDED
examples/2.jpg
ADDED
examples/3.jpg
ADDED
models.py
ADDED
@@ -0,0 +1,135 @@
import torch
import torch.nn as nn

from constants import *

"""
Class for custom activation.
"""
class SymReLU(nn.Module):
    def __init__(self, inplace: bool = False):
        super().__init__()
        self.inplace = inplace

    def forward(self, input):
        return torch.min(torch.max(input, -torch.ones_like(input)), torch.ones_like(input))

    def extra_repr(self) -> str:
        inplace_str = 'inplace=True' if self.inplace else ''
        return inplace_str


"""
Class implementing the YOLO-Stamp architecture described in https://link.springer.com/article/10.1134/S1054661822040046.
"""
class YOLOStamp(nn.Module):
    def __init__(
            self,
            anchors=ANCHORS,
            in_channels=3,
    ):
        super().__init__()

        self.register_buffer('anchors', torch.tensor(anchors))

        self.act = SymReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.norm1 = nn.BatchNorm2d(num_features=8)
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.norm2 = nn.BatchNorm2d(num_features=16)
        self.conv3 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.norm3 = nn.BatchNorm2d(num_features=16)
        self.conv4 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.norm4 = nn.BatchNorm2d(num_features=16)
        self.conv5 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.norm5 = nn.BatchNorm2d(num_features=16)
        self.conv6 = nn.Conv2d(in_channels=16, out_channels=24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.norm6 = nn.BatchNorm2d(num_features=24)
        self.conv7 = nn.Conv2d(in_channels=24, out_channels=24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.norm7 = nn.BatchNorm2d(num_features=24)
        self.conv8 = nn.Conv2d(in_channels=24, out_channels=48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.norm8 = nn.BatchNorm2d(num_features=48)
        self.conv9 = nn.Conv2d(in_channels=48, out_channels=48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.norm9 = nn.BatchNorm2d(num_features=48)
        self.conv10 = nn.Conv2d(in_channels=48, out_channels=48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.norm10 = nn.BatchNorm2d(num_features=48)
        self.conv11 = nn.Conv2d(in_channels=48, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.norm11 = nn.BatchNorm2d(num_features=64)
        self.conv12 = nn.Conv2d(in_channels=64, out_channels=256, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
        self.norm12 = nn.BatchNorm2d(num_features=256)
        self.conv13 = nn.Conv2d(in_channels=256, out_channels=len(anchors) * 5, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))

    def forward(self, x, head=True):
        x = x.type(self.conv1.weight.dtype)
        x = self.act(self.pool(self.norm1(self.conv1(x))))
        x = self.act(self.pool(self.norm2(self.conv2(x))))
        x = self.act(self.pool(self.norm3(self.conv3(x))))
        x = self.act(self.pool(self.norm4(self.conv4(x))))
        x = self.act(self.pool(self.norm5(self.conv5(x))))
        x = self.act(self.norm6(self.conv6(x)))
        x = self.act(self.norm7(self.conv7(x)))
        x = self.act(self.pool(self.norm8(self.conv8(x))))
        x = self.act(self.norm9(self.conv9(x)))
        x = self.act(self.norm10(self.conv10(x)))
        x = self.act(self.norm11(self.conv11(x)))
        x = self.act(self.norm12(self.conv12(x)))
        x = self.conv13(x)
        nb, _, nh, nw = x.shape
        x = x.permute(0, 2, 3, 1).view(nb, nh, nw, self.anchors.shape[0], 5)
        return x


class Encoder(torch.nn.Module):
    '''
    Encoder Class
    Values:
        im_chan: the number of channels of the input image, a scalar
        output_chan: the dimension of the output latent vector, a scalar
        hidden_dim: the inner dimension, a scalar
    '''

    def __init__(self, im_chan=3, output_chan=Z_DIM, hidden_dim=ENC_HIDDEN_DIM):
        super(Encoder, self).__init__()
        self.z_dim = output_chan
        self.disc = torch.nn.Sequential(
            self.make_disc_block(im_chan, hidden_dim),
            self.make_disc_block(hidden_dim, hidden_dim * 2),
            self.make_disc_block(hidden_dim * 2, hidden_dim * 4),
            self.make_disc_block(hidden_dim * 4, hidden_dim * 8),
            self.make_disc_block(hidden_dim * 8, output_chan * 2, final_layer=True),
        )

    def make_disc_block(self, input_channels, output_channels, kernel_size=4, stride=2, final_layer=False):
        '''
        Function to return a sequence of operations corresponding to an encoder block of the VAE,
        consisting of a convolution, a batchnorm (except in the final layer), and an activation.
        Parameters:
            input_channels: how many channels the input feature representation has
            output_channels: how many channels the output feature representation should have
            kernel_size: the size of each convolutional filter, equivalent to (kernel_size, kernel_size)
            stride: the stride of the convolution
            final_layer: whether we're on the final layer (affects activation and batchnorm)
        '''
        if not final_layer:
            return torch.nn.Sequential(
                torch.nn.Conv2d(input_channels, output_channels, kernel_size, stride),
                torch.nn.BatchNorm2d(output_channels),
                torch.nn.LeakyReLU(0.2, inplace=True),
            )
        else:
            return torch.nn.Sequential(
                torch.nn.Conv2d(input_channels, output_channels, kernel_size, stride),
            )

    def forward(self, image):
        '''
        Function for completing a forward pass of the Encoder: given an image tensor,
        returns the mean and standard deviation of the latent distribution.
        Parameters:
            image: an image tensor of shape (batch, im_chan, height, width)
        '''
        disc_pred = self.disc(image)
        encoding = disc_pred.view(len(disc_pred), -1)
        # The stddev output is treated as the log of the variance of the normal
        # distribution by convention and for numerical stability
        return encoding[:, :self.z_dim], encoding[:, self.z_dim:].exp()
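A brief sketch of how the Encoder's two outputs are consumed: app.py keeps only the mean as the stamp embedding, while VAE-style training would sample the latent with the reparameterization trick (the sampling line below is illustrative and not part of this Space):

encoder = Encoder()
stamp = torch.randn(1, 3, 118, 118)             # a cropped stamp resized as in app.py
mean, std = encoder(stamp)                      # each of shape (1, Z_DIM)
z_sample = mean + std * torch.randn_like(std)   # reparameterized sample (training time)
embedding = mean                                # deterministic embedding used at inference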
utils.py
ADDED
@@ -0,0 +1,250 @@
from PIL import Image, ImageDraw
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from constants import *

def visualize_bbox(image: Image, prediction):
    img = image.copy()
    draw = ImageDraw.Draw(img)
    for i, box in enumerate(prediction):
        x1, y1, x2, y2 = box.cpu()
        text_w, text_h = draw.textsize(str(i + 1))
        label_y = y1 if y1 <= text_h else y1 - text_h
        draw.rectangle((x1, y1, x2, y2), outline='red')
        draw.rectangle((x1, label_y, x1+text_w, label_y+text_h), outline='red', fill='red')
        draw.text((x1, label_y), str(i + 1), fill='white')
    return img

def xywh2xyxy(x):
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[..., 0] = x[..., 0]
    y[..., 1] = x[..., 1]
    y[..., 2] = x[..., 0] + x[..., 2]
    y[..., 3] = x[..., 1] + x[..., 3]
    return y

def output_tensor_to_boxes(boxes_tensor):
    """
    Converts the YOLO output tensor to a list of boxes with probabilities.

    Arguments:
    boxes_tensor -- tensor of shape (S, S, BOX, 5)

    Returns:
    boxes -- list of shape (None, 5)

    Note: "None" is here because you don't know the exact number of selected boxes, as it depends on the threshold.
    For example, the actual output size would be (10, 5) if there are 10 boxes.
    """
    cell_w, cell_h = W/S, H/S
    boxes = []

    for i in range(S):
        for j in range(S):
            for b in range(BOX):
                anchor_wh = torch.tensor(ANCHORS[b])
                data = boxes_tensor[i,j,b]
                xy = torch.sigmoid(data[:2])
                wh = torch.exp(data[2:4])*anchor_wh
                obj_prob = torch.sigmoid(data[4])

                if obj_prob > OUTPUT_THRESH:
                    x_center, y_center, w, h = xy[0], xy[1], wh[0], wh[1]
                    x, y = x_center+j-w/2, y_center+i-h/2
                    x,y,w,h = x*cell_w, y*cell_h, w*cell_w, h*cell_h
                    box = [x,y,w,h, obj_prob]
                    boxes.append(box)
    return boxes

def overlap(interval_1, interval_2):
    """
    Calculates the length of overlap between two intervals.

    Arguments:
    interval_1 -- list or tuple of shape (2,) containing endpoints of the first interval
    interval_2 -- list or tuple of shape (2,) containing endpoints of the second interval

    Returns:
    overlap -- length of overlap
    """
    x1, x2 = interval_1
    x3, x4 = interval_2
    if x3 < x1:
        if x4 < x1:
            return 0
        else:
            return min(x2,x4) - x1
    else:
        if x2 < x3:
            return 0
        else:
            return min(x2,x4) - x3


def compute_iou(box1, box2):
    """
    Compute IoU between box1 and box2.

    Arguments:
    box1 -- list of shape (5, ). Represents the first box
    box2 -- list of shape (5, ). Represents the second box
    Each box is [x, y, w, h, prob]

    Returns:
    iou -- intersection over union score between the two boxes
    """
    x1,y1,w1,h1 = box1[0], box1[1], box1[2], box1[3]
    x2,y2,w2,h2 = box2[0], box2[1], box2[2], box2[3]

    area1, area2 = w1*h1, w2*h2
    intersect_w = overlap((x1,x1+w1), (x2,x2+w2))
    intersect_h = overlap((y1,y1+h1), (y2,y2+h2))
    if intersect_w == w1 and intersect_h == h1 or intersect_w == w2 and intersect_h == h2:
        return 1.
    intersect_area = intersect_w*intersect_h
    iou = intersect_area/(area1 + area2 - intersect_area)
    return iou


def nonmax_suppression(boxes, iou_thresh = IOU_THRESH):
    """
    Removes overlapping bboxes.

    Arguments:
    boxes -- list of shape (None, 5)
    iou_thresh -- maximal value of iou when boxes are considered different
    Each box is [x, y, w, h, prob]

    Returns:
    boxes -- list of shape (None, 5) with overlapping boxes removed
    """
    boxes = sorted(boxes, key=lambda x: x[4], reverse=True)
    for i, current_box in enumerate(boxes):
        if current_box[4] <= 0:
            continue
        for j in range(i+1, len(boxes)):
            iou = compute_iou(current_box, boxes[j])
            if iou > iou_thresh:
                boxes[j][4] = 0
    boxes = [box for box in boxes if box[4] > 0]
    return boxes

def heatmap(data, row_labels, col_labels, ax=None,
            cbar_kw=None, cbarlabel="", **kwargs):
    """
    Create a heatmap from a numpy array and two lists of labels.

    Parameters
    ----------
    data
        A 2D numpy array of shape (M, N).
    row_labels
        A list or array of length M with the labels for the rows.
    col_labels
        A list or array of length N with the labels for the columns.
    ax
        A `matplotlib.axes.Axes` instance to which the heatmap is plotted. If
        not provided, use current axes or create a new one. Optional.
    cbar_kw
        A dictionary with arguments to `matplotlib.Figure.colorbar`. Optional.
    cbarlabel
        The label for the colorbar. Optional.
    **kwargs
        All other arguments are forwarded to `imshow`.
    """

    if ax is None:
        ax = plt.gca()

    if cbar_kw is None:
        cbar_kw = {}

    # Plot the heatmap
    im = ax.imshow(data, **kwargs)

    # Create colorbar
    cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)
    cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")

    # Show all ticks and label them with the respective list entries.
    ax.set_xticks(np.arange(data.shape[1]), labels=col_labels)
    ax.set_yticks(np.arange(data.shape[0]), labels=row_labels)

    # Let the horizontal axes labeling appear on top.
    ax.tick_params(top=True, bottom=False,
                   labeltop=True, labelbottom=False)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=-30, ha="right",
             rotation_mode="anchor")

    # Turn spines off and create white grid.
    ax.spines[:].set_visible(False)

    ax.set_xticks(np.arange(data.shape[1]+1)-.5, minor=True)
    ax.set_yticks(np.arange(data.shape[0]+1)-.5, minor=True)
    ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
    ax.tick_params(which="minor", bottom=False, left=False)

    return im, cbar

def annotate_heatmap(im, data=None, valfmt="{x:.2f}",
                     textcolors=("black", "white"),
                     threshold=None, **textkw):
    """
    A function to annotate a heatmap.

    Parameters
    ----------
    im
        The AxesImage to be labeled.
    data
        Data used to annotate. If None, the image's data is used. Optional.
    valfmt
        The format of the annotations inside the heatmap. This should either
        use the string format method, e.g. "$ {x:.2f}", or be a
        `matplotlib.ticker.Formatter`. Optional.
    textcolors
        A pair of colors. The first is used for values below a threshold,
        the second for those above. Optional.
    threshold
        Value in data units according to which the colors from textcolors are
        applied. If None (the default) uses the middle of the colormap as
        separation. Optional.
    **kwargs
        All other arguments are forwarded to each call to `text` used to create
        the text labels.
    """

    if not isinstance(data, (list, np.ndarray)):
        data = im.get_array()

    # Normalize the threshold to the image's color range.
    if threshold is not None:
        threshold = im.norm(threshold)
    else:
        threshold = im.norm(data.max())/2.

    # Set default alignment to center, but allow it to be
    # overwritten by textkw.
    kw = dict(horizontalalignment="center",
              verticalalignment="center")
    kw.update(textkw)

    # Get the formatter in case a string is supplied
    if isinstance(valfmt, str):
        valfmt = matplotlib.ticker.StrMethodFormatter(valfmt)

    # Loop over the data and create a `Text` for each "pixel".
    # Change the text's color depending on the data.
    texts = []
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)])
            text = im.axes.text(j, i, valfmt(data[i, j], None), **kw)
            texts.append(text)

    return texts
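A quick usage sketch for the NMS helper with boxes in the [x, y, w, h, prob] format it expects; the values are made up for illustration:

candidates = [
    [10, 10, 50, 50, 0.9],    # high-confidence stamp
    [12, 12, 50, 50, 0.8],    # near-duplicate of the first box (IoU > IOU_THRESH)
    [200, 200, 40, 40, 0.7],  # a separate stamp elsewhere on the page
]
kept = nonmax_suppression(candidates)
# kept -> [[10, 10, 50, 50, 0.9], [200, 200, 40, 40, 0.7]]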