import os

# Force Gradio 3.47.1 at runtime, replacing whatever version the Space preinstalls;
# this must run before `import gradio` below.
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.47.1")

from transformers import pipeline
import gradio
from PIL import Image
from sentence_transformers import SentenceTransformer, util
import spaces

# Models: scene segmentation, clothing segmentation, human parsing, face parsing,
# face detection, person detection, plus a sentence encoder for label matching.
backgroundPipe = pipeline("image-segmentation", model="facebook/maskformer-swin-large-coco")
personPipe = pipeline("image-segmentation", model="mattmdjaga/segformer_b2_clothes")
sentenceModel = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
personDetailsPipe = pipeline("image-segmentation", model="yolo12138/segformer-b2-human-parse-24")
faceModel = pipeline("image-segmentation", model="jonathandinu/face-parsing")
faceDetectionModel = pipeline("object-detection", model="aditmohan96/detr-finetuned-face")
personDetectionPipe = pipeline("object-detection", model="hustvl/yolos-tiny")
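
# A transformers object-detection pipeline returns a list of dicts shaped like
# the following (values illustrative):
# [{"score": 0.99, "label": "person",
#   "box": {"xmin": 10, "ymin": 20, "xmax": 200, "ymax": 400}}]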

def getPersonDetail(image):
    """Detect each person, run human parsing on the crop, and return {label: mask}."""
    data = personDetectionPipe(image)
    persn = [det["box"] for det in data if det["label"].lower() == "person"]
    ret = {}
    for n, cord in enumerate(persn, start=1):
        crop_box = (cord["xmin"], cord["ymin"], cord["xmax"], cord["ymax"])
        cropped_image = image.crop(crop_box)
        personData = personDetailsPipe(cropped_image)
        for dt in personData:
            # Prefix labels with a person index only when several people were detected.
            if len(persn) > 1:
                ret[f"person {n} {dt['label']}".lower()] = cbiwm(image, dt["mask"], cord)
            else:
                ret[dt["label"].lower()] = cbiwm(image, dt["mask"], cord)
    return ret
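
# Illustrative return shape: {"person 1 hair": <PIL.Image>, "person 1 pants": <PIL.Image>, ...}
# (single-person images omit the "person N" prefix).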

def cbiwm(image, mask, coordinates):
    """Paste a crop-local mask back at its original position on a full-size black canvas."""
    black_image = Image.new("RGBA", image.size, (0, 0, 0, 255))
    black_image.paste(mask, (coordinates["xmin"], coordinates["ymin"]), mask)
    return black_image
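
# Note: with a mask argument, PIL's Image.paste updates only the pixels where the
# mask is nonzero (blending for intermediate values), and converts the pasted image
# to the canvas mode as needed.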

def processFaceDetails(image):
    """Add face-part masks from the face-parsing model on top of the person details."""
    ret = getPersonDetail(image)
    data = faceDetectionModel(image)
    if not data:
        return ret
    # Keeps the original selection rule: the second detection when more than one exists.
    coordinates = data[1]["box"] if len(data) > 1 else data[0]["box"]
    crop_box = (coordinates["xmin"], coordinates["ymin"], coordinates["xmax"], coordinates["ymax"])
    cropped_image = image.crop(crop_box)
    facedata = faceModel(cropped_image)
    for imask in facedata:
        ret[imask["label"].replace(".png", "").lower()] = cbiwm(image, imask["mask"], coordinates)
    return ret
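
# Illustrative labels added by the face-parsing pass: "skin", "nose", "hair",
# "u_lip", "l_lip"; any ".png" suffix in a label is stripped above.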

def getImageDetails(image) -> dict:
    """Collect every available mask: face parts, person details, clothing, and scene segments."""
    ret = processFaceDetails(image)
    person = personPipe(image)
    bg = backgroundPipe(image)
    # Later assignments win, so clothing masks overwrite scene masks on label collisions.
    for imask in bg:
        ret[imask["label"].lower()] = imask["mask"]  # Apply a base64 image converter here if needed
    for mask in person:
        ret[mask["label"].lower()] = mask["mask"]  # Apply a base64 image converter here if needed
    return ret

def processSentence(sentence: str, semilist: list):
    """Return the label in semilist most semantically similar to the query sentence."""
    query_embedding = sentenceModel.encode(sentence)
    passage_embedding = sentenceModel.encode(semilist)
    scores = util.dot_score(query_embedding, passage_embedding)[0]
    return semilist[int(scores.argmax())]
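
# Illustrative call: processSentence("the shirt", ["hair", "upper-clothes", "wall"])
# scores each candidate against the query by embedding dot product and returns the
# best match, likely "upper-clothes" here.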

def process_image(image):
    """Turn a black/white mask into a cutout: black pixels become fully transparent,
    white pixels become black (keeping their alpha), everything else passes through."""
    rgba_image = image.convert("RGBA")
    final_data = [
        (0, 0, 0, 0) if pixel[:3] == (0, 0, 0)
        else (0, 0, 0, pixel[3]) if pixel[:3] == (255, 255, 255)
        else pixel
        for pixel in rgba_image.getdata()
    ]
    processed_image = Image.new("RGBA", rgba_image.size)
    processed_image.putdata(final_data)
    return processed_image
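
# Worked example: (0, 0, 0, 255) -> (0, 0, 0, 0) (transparent) and
# (255, 255, 255, 255) -> (0, 0, 0, 255) (opaque black); any other color is unchanged.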

@spaces.GPU()
def processAndGetMask(image: Image.Image, text: str):
    """Segment the image, pick the mask whose label best matches the text, and cut it out."""
    datas = getImageDetails(image)
    labs = list(datas.keys())
    selector = processSentence(text, labs)
    imageout = datas[selector]
    print(f"Selected: {selector} among: {labs}")
    return process_image(imageout)

demo = gradio.Interface(
    processAndGetMask,
    [gradio.Image(label="Input Image", type="pil"), gradio.Text(label="Input text to segment")],
    gradio.Image(label="Output Image", type="pil"),
)
demo.launch(share=True)